

# Author: Ivan Canay | Northwestern University | March 2023
# Copyright (c) 2023 Northwestern University. All rights reserved.
#-------------------------------------------------------------------
# X-plain: 	This file has several function in R that are used in
#			"Inference with non-ignorable cluster sizes"
#-------------------------------------------------------------------
# Last update: March 1, 2023 includes 
#   - changed in various functions to accomodate CAR
#   - also changes to replace the zeta distribution with a t-dist
#-------------------------------------------------------------------

require("Matrix", quietly = TRUE);
require("VGAM", quietly = TRUE);
require("extraDistr", quietly = TRUE);
require(sandwich, quietly = TRUE)
require(lmtest, quietly = TRUE)
library(tictoc, quietly=TRUE)
options(warn=-1)

#-------------------------------------------------------------------
gen.cluster.sizes <-function(G,max.support,s=1.5){
#-------------------------------------------------------------------
# X-plain: Draws G clusters with a maximum sample size Nmax given Z
#-------------------------------------------------------------------
# INPUTS: - G: the number of clusters
#         - max.support: n-1 in the beta binomial | Nmax = 10*(max.support +1)  
#         - s is the dof for the t-distribution in sample.4
#------------------------------------------------------------------
# RETURNS: 4 designs for sample sizes for G clusters. 
#          The output is a (G x 4) matrix 
# plot(grid,dbbinom(grid,1000, alpha = 10, beta = 90))
#------------------------------------------------------------------
# Setup sample sizes per cluster

# Design 1: Beta binomial uniform
sample.1 = 10*(rbbinom(G,max.support, alpha = 1, beta = 1)+1);
sample.2 = 10*(rbbinom(G,max.support, alpha = 0.4, beta = 0.4)+1);
sample.3 = 10*(rbbinom(G,max.support, alpha = 10, beta = 50)+1);
sample.4 = round(10*abs(rt(G,s)))+10;

samples = matrix(c(sample.1,sample.2,sample.3,sample.4),G,4)

# Return the data frame
    return(samples);
}

#-------------------------------------------------------------------
gen.cluster.samples <-function(cluster.sizes){
#-------------------------------------------------------------------
# X-plain: Draws samples from each clusters (M)
#-------------------------------------------------------------------
# INPUTS: - the output of gen.cluster.sizes
#------------------------------------------------------------------
# RETURNS: 3 designs of samples (all, same, proportional)
#          for each of the 4 cases in cluster.sizes 
#          The output is a list with 4 matrices of dimension (G x 3) 
#------------------------------------------------------------------
dimen = dim(cluster.sizes);
# Design 1: all
sample.a = cluster.sizes;
# Design 2: same
sample.b = matrix(c(10),dimen[1],dimen[2])
# Design 3: all if 10 or less, and 40% otherwise but not exceeding 200. 
sample.c = (cluster.sizes<=20)*cluster.sizes + (cluster.sizes>20)*pmax(20,pmin(.4*cluster.sizes,200)); 

# return samples in a list
 return(list(all=sample.a,same=sample.b,prop=sample.c))
}

#-------------------------------------------------------------------
gen.car <-function(Zg2,v.Ng,v.Mg,G,car.design=0,pa=0.5){
#-------------------------------------------------------------------
# X-plain: Generates individual treatment assignment
#-------------------------------------------------------------------
# INPUTS: - Zg2 is the beta covariates. 
#         - v.Ng is the vector containing obs. for each cluster
#         - v.Mg is the vector containing sampled obs. for each cluster
#         - G: number of clusters
#         - car.design: 0 (srs), 1 (car on Zg2), 2 (car of both Zg1/2) 
#------------------------------------------------------------------
# RETURNS: a vector of individual level treatment assignments
#------------------------------------------------------------------
num.strata=10;
cluster.A = cbind(rep(0,G)); 

if (car.design==0){# Do simple random sampling
        A    = rep(runif(G)>0.5,v.Mg);
        I.S  = matrix(1,G,1);
    } else if (car.design==1){ # CAR on Zg2 only with 10 strata
        bounds = seq(-2.25,2.25,length.out = num.strata+1); # careful with bounds
        I.S    = matrix(0,G,num.strata);
        for (s in 1:num.strata){
            I.S[,s] = (Zg2>bounds[s])*(Zg2<=bounds[s+1]);
        }
        for (k in 1:num.strata){
            index    = which(I.S[,k]==1);
            ns       = length(index);
            # pick a random permutation of 1s and 0s. 
            cluster.A[index] = sample(c(rep(1,floor(pa*ns)),rep(0,ns-floor(pa*ns))));
        }    
        A = rep(cluster.A,v.Mg);
    } else if (car.design==2){# CAR on Zg2 and Zg1 with 10 = 5x2 strata
        bounds = seq(-2.25,2.25,length.out = num.strata/2+1); # careful with bounds
        I.S    = matrix(0,G,num.strata);
        for (s in 1:num.strata){
            strat.Ng = 2*(v.Ng>median(v.Ng))-1; # +1 for big and -1 for small Ng
            j = ceiling(s/2);
            sign = 2*(s %% 2) -1;
            I.S[,s] =  (Zg2>bounds[j])*(Zg2<=bounds[j+1])*(strat.Ng*sign==1);
        }
        for (k in 1:num.strata){
            index    = which(I.S[,k]==1);
            ns       = length(index);
            # pick a random permutation of 1s and 0s. 
            cluster.A[index] = sample(c(rep(1,floor(pa*ns)),rep(0,ns-floor(pa*ns))));
        }    
        A = rep(cluster.A,v.Mg);
    }     

# Return a vector of individual level treatment assignments and strata
    return(list(A=A,I.S=I.S))
}

#-------------------------------------------------------------------
gen.data <-function(cluster.samples,design,car.design,mu.1=0,pa=1/2,type="agg",sigma1=sqrt(2),gamma=1){
#-------------------------------------------------------------------
# X-plain: Generates potential outcomes and treament assignment
#-------------------------------------------------------------------
# INPUTS: - the output of cluster.samples
#         - Design takes two values 1 for i.i.d and 0 for Z depending in Ng
#         - car.design takes values 0,1,2 (see function gen.car)
#         - pa is the prob of treatment assignment (see function gen.car)
#         - mu.1 is the mean of Yig(1). Under the null: mu.1=0
#         - type determines is we get aggregated data (agg) or both agg and individual data (ind)
#         - sigma1 is the sd of the error term of Yig(1) (sigma0 is set to 1)
#         - gamma is the coefficient multiplying Zg2 
#------------------------------------------------------------------
# RETURNS: a list with 12 data sets of (G Ng Mg Ng/g Y D Y1 Y0 I.S) 
#------------------------------------------------------------------
dimen = dim(cluster.samples$all);
G = dimen[1]; cases.Ng = dimen[2];
cases.Mg = length(cluster.samples);
total.cases = cases.Mg*cases.Ng;
Ng = matrix(cluster.samples[[1]],G,cases.Ng); # the first sample equals Ng;
case=0;

if (type=="agg"){
    data_list = vector(mode = "list", length = total.cases);
} else if (type=="ind"){
    data_list = vector(mode = "list", length = 2*total.cases);
}

for (j in 1:cases.Mg){
    Mg = matrix(cluster.samples[[j]],G,cases.Ng);
    
    for (k in 1:cases.Ng){
    case = case+1;
    cluster.indicator = rep(c(1:G),Mg[,k]);
    total.sample = length(cluster.indicator);
    
    #### cluster level random variables ####
    eta1.g = runif(G); eta2.g.1 = 5*runif(G); eta2.g.0 = runif(G); 
    #### Desing for Zg-1 
    if (design==1){
        Zg1 = 2*(runif(G)>0.5)-1;
    } else if (design==2){
        # Indicates if cities are in the north. More likely if they are big
        Zg.big   = 2*(runif(G)>0.25)-1;
        Zg.small = 2*(runif(G)>0.75)-1;
        threshold = mean(Ng[,k]);
        Zg1 = Zg.big*(Ng[,k]>=threshold)+Zg.small*(Ng[,k]<threshold);
    } 
    Zg2 = sqrt(20)*(rbeta(G,2,2)-1/2);
    #### m.1 and m.0 for the variable Zg2
    m1.g = gamma*Zg2;
    m0.g = gamma*(-((log(Zg2+3))*(Zg2<1/2)-0.5602138));

    #### Individual level random variables ####
    # treatment assignment
    cardata = gen.car(Zg2,Ng[,k],Mg[,k],G,car.design,pa);
    A = cardata$A;
    I.S = cardata$I.S;

    # error terms
    epsilon.ig.1 = rnorm(total.sample,0,sigma1);
    epsilon.ig.0 = rnorm(total.sample);
     # potential outcomes 
    Yig.0 = rep(eta2.g.0*Zg1 + m0.g,Mg[,k]) + epsilon.ig.0;
    Yig.1 = rep(mu.1*eta1.g + eta2.g.1*Zg1 + m1.g,Mg[,k]) + epsilon.ig.1;
    Y = Yig.1*A + Yig.0*(1-A);
    # full data of Y and A, individual data:
    fulldata = matrix(0,total.sample,8);
    fulldata[,1] = cluster.indicator;
    fulldata[,2] = rep(Ng[,k],Mg[,k]);
    fulldata[,3] = rep(Mg[,k],Mg[,k]);
    fulldata[,4] = fulldata[,2]/fulldata[,3];
    fulldata[,5] = Y;
    fulldata[,6] = A;
    fulldata[,7] = Yig.1;
    fulldata[,8] = Yig.0;
    colnames(fulldata) <- c("cluster","Ng","Mg","ratio","Yg","Ag","Yg.1","Yg.0");

    agg.data=aggregate(x = fulldata[,5:8],   # Specify data columns
          by = list(fulldata[,1]),           # Specify group indicator
          FUN = mean)                        # Specify function (i.e. mean)

    colnames(agg.data) <- c("cluster","Yg","Ag","Yg.1","Yg.0");

    agg.data$Ng = Ng[,k];
    agg.data$Mg = Mg[,k];
    agg.data$ratio  = Ng[,k]/Mg[,k];
    agg.data$strata = I.S;
     
    # Putting all together in a matrix with columeng G Ng Mg sqrt(Ng/Mg) Y A Y1 Y0
    if (type=="agg"){
        data_list[[case]] = agg.data;
    } else if (type=="ind"){
        data_list[[case]] = agg.data;
        data_list[[total.cases+case]] = fulldata;
    }
}
}
# return `total.cases' (12) of data sets 
 return(data_list)
}

#-------------------------------------------------------------------
est.theta.reg <-function(data){
#-------------------------------------------------------------------
# X-plain: computes theta hat 1 and 2 using regression
#-------------------------------------------------------------------
# INPUTS: - Requires individual data so type="ind" in gen.data
#------------------------------------------------------------------
# RETURNS: a list with 12 estimators of theta1 and theta2
#------------------------------------------------------------------

# with individual data, the first 12 data sets are aggregated 
# we then use data sets 13 to 24. 
cases = length(data);
hat.theta = matrix(0,cases/2,2); 
st.dev = matrix(0,cases/2,2)
start = 1+ cases/2;

for (j in start:cases){
    mydata=as.data.frame(data[[j]]);
    agg.data=aggregate(x = mydata[,4:6],     # Specify data column (Ng/Mg,Y,A)
          by = list(mydata[,1]),             # Specify group indicator
          FUN = mean)                        # Specify function (i.e. mean)
    reg1 = lm(Yg~Ag,agg.data);
    index = j-cases/2;
    hat.theta[index,1] = reg1$coeff[2];
    #reg2 = lm(Yg~Ag,mydata,weight=ratio);   # This is NOT good for standard errors
    reg2 = lm(sqrt(ratio)*Yg~sqrt(ratio)+I(sqrt(ratio)*Ag)-1,mydata);
    hat.theta[index,2] = reg2$coeff[2];

    # Standard Errors: Theta 1
    st.dev[index,1] = coeftest(reg1, vcov = vcovHC(reg1,type="HC1"))[2,2];

    # Standard Errors: Theta 2
    st.dev[index,2]= coeftest(reg2,vcov = vcovCL,type = "HC1",cluster = mydata$cluster)[2,2];
    
    # alternative version below that is MUCH slower. 
    #  Ntot = dim(mydata)[1];
    #  Xw = as.matrix(cbind(ones=sqrt(mydata$V4),A=mydata$V6*sqrt(mydata$V4)));
    # st.dev[j,3]= sqrt(cce.stata(Xw,reg2,mydata$V1,dof="stata")[2,2]);
}
    return(list(hat.theta=hat.theta,st.dev=st.dev))
}

#-------------------------------------------------------------------
est.theta.diff <-function(data,G,car.design=0,pa=1/2,type="agg"){
#-------------------------------------------------------------------
# X-plain: computes theta hat 1 and 2 using formulas in paper
#-------------------------------------------------------------------
# INPUTS: - the output of gen.data 
#         - G: number of clusters
#         - car.design takes values 0,1,2 (see function gen.car)
#         - type determines is we get aggregated data (agg) or both agg and individual data (ind)
#------------------------------------------------------------------
# RETURNS: a list with 12 estimators of theta1 and theta2
#------------------------------------------------------------------
if (type=="agg"){
    cases = length(data);    
} else if (type=="ind"){
    cases = length(data)/2;
}
if (car.design==0){
    tau.s = pa*(1-pa);
} else{
    tau.s = 0;
}

hat.theta = matrix(0,cases,2);
st.dev = matrix(0,cases,2)

for (j in 1:cases){
    mydata=as.data.frame(data[[j]]);
    cluster = mydata$cluster;
    Yg = mydata$Yg;
    Ag = mydata$Ag;
    Ng = mydata$Ng;
    I.S= mydata$strata;

    #Theta 1:
    mu.G.1.Y = sum(Yg*Ag)/sum(Ag);
    mu.G.0.Y = sum(Yg*(1-Ag))/sum(1-Ag); 
    hat.theta[j,1] = mu.G.1.Y - mu.G.0.Y; 
    #Theta 2:
    hat.theta[j,2] = sum(Yg*Ag*Ng)/sum(Ag*Ng) - sum(Yg*(1-Ag)*Ng)/sum((1-Ag)*Ng);  

    # (Theta 1) Estimation of variance (pieces)
    mu.G.1.Y2  = sum(Yg^2*Ag)/sum(Ag);
    mu.G.0.Y2  = sum(Yg^2*(1-Ag))/sum((1-Ag));
    mu.G.1.Y.s = colSums(Yg*I.S*Ag)/max(colSums(Ag*I.S),0.1);
    mu.G.0.Y.s = colSums(Yg*I.S*(1-Ag))/max(colSums((1-Ag)*I.S),0.1);
    G.s        = colSums((I.S>0));

    # (Theta 1) Estimation of variance (three terms)
    zeta.Y = (1/pa)*(mu.G.1.Y2-sum((1/G)*G.s*mu.G.1.Y.s^2)) + (1/(1-pa))*(mu.G.0.Y2-sum((1/G)*G.s*mu.G.0.Y.s^2));
    zeta.H = sum((1/G)*G.s*((mu.G.1.Y.s-mu.G.1.Y)-(mu.G.0.Y.s-mu.G.0.Y))^2);
    zeta.A = tau.s*sum((1/G)*G.s*((1/pa)*(mu.G.1.Y.s-mu.G.1.Y)+(1/(1-pa))*(mu.G.0.Y.s-mu.G.0.Y))^2);

    hat.sigma.sq.1 = zeta.Y + zeta.H + zeta.A;
    st.dev[j,1] = sqrt(hat.sigma.sq.1)/sqrt(G);

    # (Theta 2) Estimation of variance (pieces)
    h.Yg = (Ng/mean(Ng))*(Yg- mean(Yg*Ag*Ng)/mean(Ag*Ng));

    mu.G.1.hY = sum(h.Yg*Ag)/sum(Ag);
    mu.G.0.hY = sum(h.Yg*(1-Ag))/sum(1-Ag); 
    mu.G.1.hY2  = sum(h.Yg^2*Ag)/sum(Ag);
    mu.G.0.hY2  = sum(h.Yg^2*(1-Ag))/sum((1-Ag));
    mu.G.1.hY.s = colSums(h.Yg*I.S*Ag)/max(colSums(Ag*I.S),0.1);
    mu.G.0.hY.s = colSums(h.Yg*I.S*(1-Ag))/max(colSums((1-Ag)*I.S),0.1);

    # Estimation of variance for THETA 2 (three terms)
    xi.Y = (1/pa)*(mu.G.1.hY2-sum((1/G)*G.s*mu.G.1.hY.s^2)) + (1/(1-pa))*(mu.G.0.hY2-sum((1/G)*G.s*mu.G.0.hY.s^2));
    xi.H = sum((1/G)*G.s*((mu.G.1.hY.s-mu.G.1.hY)-(mu.G.0.Y.s-mu.G.0.hY))^2);
    xi.A = tau.s*sum((1/G)*G.s*((1/pa)*(mu.G.1.hY.s-mu.G.1.hY)+(1/(1-pa))*(mu.G.0.hY.s-mu.G.0.hY))^2);

    hat.sigma.sq.2 = xi.Y + xi.H + xi.A;
    st.dev[j,2] = sqrt(hat.sigma.sq.2)/sqrt(G);

}
    return(list(hat.theta=hat.theta,st.dev=st.dev))
}

#-------------------------------------------------------------------
pop.theta <-function(max.support,design){
#-------------------------------------------------------------------
# X-plain: computes the population values of theta1 and theta2
#-------------------------------------------------------------------
# INPUTS: - max.support in (19,49,99) and design in (1,2)
#------------------------------------------------------------------
# RETURNS: a list of 4 values of theta1 and theta2
#------------------------------------------------------------------

if (design==1){
    theta= matrix(0,4,2);
} else {
    if (max.support==19){
    theta=matrix(c(0,0,-0.2160,-0.5516,0.4763,0.6279,0.1375,0.1050),4,2);
    } else if (max.support==49){
    theta=matrix(c(0,0,-0.1407,-0.5516,0.4900,0.6581,0.1625,0.1050),4,2);    
    } else if (max.support==99){
    theta=matrix(c(0,0,-0.0635,-0.5516,0.4950,0.6690,0.2100,0.1050),4,2);        
    }
}

return(theta)
}
#------------------------------------------------------------------
# END OF FILE
#------------------------------------------------------------------

#-------------------------------------------------------------------
pop.theta.2 <-function(huge.data,max.support){
#-------------------------------------------------------------------
# X-plain: computes the population values of theta1 and theta2
#-------------------------------------------------------------------
# INPUTS: - a huge sample of clusters from the DGP
#------------------------------------------------------------------
# RETURNS: a list of 4 values of theta1 and theta2
#------------------------------------------------------------------
theta = matrix(0,4,2);
tot.obs = dim(huge.data)[1];
Mean.vector = matrix(0,4,1);
s=1.5;

#Mean.vector = colMeans(huge.data);
Mean.vector[1] = 10*(max.support*1/(1+1)+1);
Mean.vector[2] = 10*(max.support*0.4/(0.4+0.4)+1);
Mean.vector[3] = 10*(max.support*10/(10+50)+1);
Mean.vector[4] = 10*(2*sqrt(s/pi)*gamma((s+1)/2)/(gamma(s/2)*(s-1)))+10;

# Analytical expression for the 4th case
#mean.t = 2*sqrt(s/pi)*gamma((s+1)/2)/(gamma(s/2)*(s-1))
#2*pht(round(mean.t),s, lower.tail = FALSE)-1

# THETA 1
E.Ng = matrix(rep(Mean.vector,each=tot.obs),tot.obs,4);
P.Ng = colSums((huge.data>=E.Ng))/tot.obs;
theta[,1] = 2*P.Ng -1;

# THETA 2
trunc.above = colSums((huge.data)*(huge.data>=E.Ng))/colSums((huge.data>=E.Ng));
trunc.below = colSums((huge.data)*(huge.data<E.Ng))/colSums((huge.data<E.Ng));
theta[,2] = (1/colMeans(huge.data))*trunc.above*P.Ng-(1/colMeans(huge.data))*trunc.below*(1-P.Ng);

return(theta)
}
#------------------------------------------------------------------




